# -*- coding:UTF-8 -*-
import requests
import re
from bs4 import BeautifulSoup

def ancientbooks():
    """Scrape the gushiwen.cn ancient-books index page.

    Returns a tuple ``(ancientbooks_url, booktitle)``:
      - ``ancientbooks_url``: absolute URLs of the book pages (entries that
        collapsed to the bare site prefix are filtered out).
      - ``booktitle``: one ``re.findall`` result per child node of the 12th
        ``cont`` div.  Non-link children (whitespace text nodes) yield empty
        lists, so real titles alternate with ``[]`` — callers index
        accordingly (odd indices 1, 3, 5, ...).
    """
    target = 'https://so.gushiwen.cn/guwen/'
    req = requests.get(url=target)
    html = req.text
    # Name the parser explicitly: a bare BeautifulSoup(html) emits a
    # GuessedAtParserWarning and may choose different parsers per machine,
    # which can change the serialized tag text the slicing below depends on.
    bf = BeautifulSoup(html, 'html.parser')
    # <div class="cont"> blocks; index 11 is the one holding the book links.
    texts = bf.find_all('div', class_='cont')
    url = []
    booktitle = []
    ancientbooks_url = []
    for urls in texts[11]:
        # Probe fixed character positions of the serialized child to decide
        # how long the href slice is (fragile — tied to the site's exact
        # markup; kept as-is because downstream indexing depends on it).
        a = str(urls)[23:24]
        b = str(urls)[22:23]
        if a == str(4):
            url.append("https://so.gushiwen.cn/" + str(urls)[10:29])
            title = re.findall('.aspx">(.*?)</a>', str(urls))
            booktitle.append(title)
        elif b == '.':
            url.append("https://so.gushiwen.cn/" + str(urls)[10:27])
            title = re.findall('.aspx">(.*?)</a>', str(urls))
            booktitle.append(title)
        else:
            url.append("https://so.gushiwen.cn/" + str(urls)[10:28])
            title = re.findall('.aspx">(.*?)</a>', str(urls))
            booktitle.append(title)
    # Whitespace children slice to '' and leave only the bare prefix; drop
    # those so only genuine book URLs are returned.
    for c in url:
        if c != 'https://so.gushiwen.cn/':
            ancientbooks_url.append(c)
    return ancientbooks_url, booktitle


def chapter(ancientbooks_urls_new):
    """Fetch one book page and return the absolute URLs of its chapters.

    ancientbooks_urls_new -- absolute URL of a single book's index page.
    """
    response = requests.get(url=ancientbooks_urls_new)
    # Each chapter link is wrapped as <span><a href="...">...</a></span>;
    # capture everything between the opening quote and the closing tag.
    hrefs = re.findall('<span><a href="(.*?)</a></span>', response.text, re.S)
    # Prefix the site root and keep at most the first 50 characters of each
    # captured fragment.
    return ["https://so.gushiwen.cn/" + h[0:50] for h in hrefs]

def content(chapter_urls, ancientbooks):
    """Download every chapter page of one book and append it to a text file.

    chapter_urls  -- list of absolute chapter-page URLs.
    ancientbooks  -- book title, used as the output file's base name.

    Output goes to ``E:\\PythonWorkstation\\books\\<title>.txt`` in append
    mode, so repeated runs accumulate text.
    """
    out_path = 'E:\\PythonWorkstation\\books\\' + ancientbooks + '.txt'
    # Context manager guarantees the handle is closed even when a request
    # raises mid-loop; the original left the file open in that case.
    with open(out_path, 'a+', encoding='utf-8') as f:
        for target in chapter_urls:
            req = requests.get(url=target)
            # Page heading: everything between <head> and </title>, written
            # as the str() of the findall list (same format as before).
            heading = re.findall('<head>(.*?)</title>', req.text)
            f.write(str(heading))
            # Chapter body text lives in <div class="contson"> blocks.
            body = re.findall('<div class="contson">(.*?)</div>', req.text, re.S)
            for piece in body:
                f.write(piece)



# --- script entry point ---------------------------------------------------
# Scrape the book index once, then download every chapter of every book.
# (The original called ancientbooks() twice and discarded the first result,
# doubling the index-page scrape; it also kept an unused counter ``g``.)
ancientbooks_urls, booktitle = ancientbooks()
# ``booktitle`` alternates empty lists (whitespace nodes) with title lists,
# so real titles sit at the odd indices: 1, 3, 5, ...
j = 1
for book_url in ancientbooks_urls:
    # NOTE(review): local renamed from ``ancientbooks`` — the original
    # shadowed the function of the same name after the first iteration.
    book_name = str(booktitle[j])
    chapter_urls = chapter(book_url)
    content(chapter_urls, book_name)
    j = j + 2